feature_description_original <- readxl::read_excel(
"data/feature_description.xlsx")
feature_description_original
customer_segmentation_raw <- read_raw_customer_data("data/customer_segmentation_test.csv")
## i Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
skimr::skim(customer_segmentation_raw)
| Name | customer_segmentation_raw |
| Number of rows | 406734 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 3 |
| factor | 6 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Postcode | 9176 | 0.98 | 1 | 9 | 0 | 2982 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| LastPaymentDate | 0 | 1.00 | 2015-01-03 | 2020-02-13 | 2018-12-06 | 1361 |
| PenultimatePaymentDate | 44699 | 0.89 | 1995-12-31 | 2020-02-05 | 2017-04-12 | 5376 |
| DateOfBirth | 155491 | 0.62 | 1902-04-21 | 2015-03-30 | 1948-03-09 | 25514 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1 | FALSE | 3 | fem: 203904, mal: 183467, fam: 19363 |
| MERCHANDISE2015 | 0 | 1 | FALSE | 2 | 0: 401845, 1: 4889 |
| MERCHANDISE2016 | 0 | 1 | FALSE | 2 | 0: 401585, 1: 5149 |
| MERCHANDISE2019 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
| MERCHANDISE2017 | 0 | 1 | FALSE | 2 | 0: 402378, 1: 4356 |
| MERCHANDISE2018 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1 | 2.52 | 4.00 | 0 | 0.0 | 2.0 | 2.0 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1 | 42.44 | 850.19 | 0 | 0.0 | 15.0 | 45.0 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1 | 1.22 | 2.02 | 0 | 0.0 | 1.0 | 1.0 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1 | 50.93 | 591.05 | 0 | 0.0 | 16.0 | 50.0 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1 | 1.06 | 1.91 | 0 | 0.0 | 0.0 | 1.0 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1 | 24.78 | 572.90 | 0 | 0.0 | 0.0 | 20.0 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1 | 1.00 | 1.87 | 0 | 0.0 | 0.0 | 1.0 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1 | 20.64 | 1552.60 | 0 | 0.0 | 0.0 | 15.0 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1 | 0.97 | 1.79 | 0 | 0.0 | 0.0 | 1.0 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1 | 46.44 | 3999.80 | 0 | 0.0 | 0.0 | 30.0 | 2400000.0 | ▇▁▁▁▁ |
| ID | 0 | 1 | 203367.50 | 117414.14 | 1 | 101684.2 | 203367.5 | 305050.8 | 406734.0 | ▇▇▇▇▇ |
zip_code_list <- readxl::read_excel("data/PLZ_Verzeichnis-20211201.xlsx")
zip_code_list
customer_segmentation_with_zip <- enrich_with_postal_info(
customer_segmentation_raw,
"data/PLZ_Verzeichnis-20211201.xlsx"
)
customer_segmentation_with_zip
customer_segmentation_first_prepro <- apply_feature_engineering(customer_segmentation_with_zip)
skimr::skim(customer_segmentation_first_prepro)
| Name | customer_segmentation_fir… |
| Number of rows | 396694 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 1 |
| factor | 13 |
| numeric | 22 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Ort | 0 | 1 | 2 | 40 | 0 | 2178 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| LastPaymentDate | 0 | 1 | 2015-01-03 | 2020-02-13 | 2018-12-11 | 1355 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1.00 | FALSE | 3 | fem: 199545, mal: 179215, fam: 17934 |
| Postcode | 0 | 1.00 | FALSE | 2249 | 122: 6776, 121: 6208, 110: 5941, 502: 5383 |
| MERCHANDISE2015 | 0 | 1.00 | FALSE | 2 | 0: 391818, 1: 4876 |
| MERCHANDISE2016 | 0 | 1.00 | FALSE | 2 | 0: 391552, 1: 5142 |
| MERCHANDISE2019 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| MERCHANDISE2017 | 0 | 1.00 | FALSE | 2 | 0: 392339, 1: 4355 |
| MERCHANDISE2018 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| Bundesland | 0 | 1.00 | FALSE | 9 | N: 88175, W: 70706, O: 66082, St: 57348 |
| generation_moniker | 146208 | 0.63 | FALSE | 5 | sil: 110508, boo: 102068, x: 33020, mil: 4734 |
| LastPaymentMONTH | 0 | 1.00 | FALSE | 12 | 12: 119035, 11: 66379, 1: 45775, 10: 42275 |
| PenultimatePaymentMONTH | 37875 | 0.90 | FALSE | 12 | 12: 91203, 11: 56900, 10: 42674, 1: 27463 |
| XMAS_donor | 0 | 1.00 | FALSE | 3 | unl: 165505, may: 119746, yes: 111443 |
| merchandise_any | 0 | 1.00 | FALSE | 2 | 0: 377620, 1: 19074 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1.00 | 2.56 | 4.03 | 0.00 | 0.00 | 2.00 | 4.00 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1.00 | 41.12 | 724.36 | 0.00 | 0.00 | 15.00 | 45.00 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1.00 | 1.24 | 2.03 | 0.00 | 0.00 | 1.00 | 1.00 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1.00 | 51.20 | 596.95 | 0.00 | 0.00 | 20.00 | 50.00 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1.00 | 1.08 | 1.92 | 0.00 | 0.00 | 0.00 | 1.00 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1.00 | 24.45 | 484.85 | 0.00 | 0.00 | 0.00 | 20.00 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1.00 | 1.02 | 1.88 | 0.00 | 0.00 | 0.00 | 1.00 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1.00 | 20.76 | 1570.91 | 0.00 | 0.00 | 0.00 | 15.00 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1.00 | 0.98 | 1.80 | 0.00 | 0.00 | 0.00 | 1.00 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1.00 | 46.90 | 4049.95 | 0.00 | 0.00 | 0.00 | 30.00 | 2400000.0 | ▇▁▁▁▁ |
| ID | 0 | 1.00 | 205024.74 | 116888.18 | 2073.00 | 103150.25 | 206597.50 | 306127.75 | 406734.0 | ▇▇▇▇▇ |
| year_born | 146204 | 0.63 | 1949.25 | 14.01 | 1902.00 | 1939.00 | 1948.00 | 1959.00 | 2015.0 | ▁▇▇▂▁ |
| age_at_last_donation | 146204 | 0.63 | 68.33 | 14.00 | 0.00 | 59.00 | 70.00 | 79.00 | 117.0 | ▁▁▇▇▁ |
| COUNTtotal | 0 | 1.00 | 6.87 | 9.93 | 1.00 | 2.00 | 3.00 | 7.00 | 273.0 | ▇▁▁▁▁ |
| SUMtotal | 0 | 1.00 | 184.43 | 4898.70 | 0.01 | 30.00 | 65.00 | 160.00 | 2400225.0 | ▇▁▁▁▁ |
| SUMaverage | 0 | 1.00 | 36.08 | 1530.61 | 0.01 | 11.25 | 17.34 | 29.42 | 750000.0 | ▇▁▁▁▁ |
| COUNTaverage | 0 | 1.00 | 1.37 | 1.99 | 0.20 | 0.40 | 0.60 | 1.40 | 54.6 | ▇▁▁▁▁ |
| LastPaymentYEAR | 0 | 1.00 | 2017.78 | 1.53 | 2015.00 | 2016.00 | 2018.00 | 2019.00 | 2020.0 | ▅▂▃▇▂ |
| PenultimatePaymentYEAR | 37875 | 0.90 | 2015.72 | 3.91 | 1995.00 | 2015.00 | 2017.00 | 2018.00 | 2020.0 | ▁▁▁▃▇ |
| donation_interval | 37875 | 0.90 | 773.66 | 1215.88 | 1.00 | 123.00 | 354.00 | 762.00 | 8766.0 | ▇▁▁▁▁ |
| days_since_last_payment | 0 | 1.00 | -1293.24 | 561.24 | -2540.00 | -1814.00 | -1102.00 | -762.00 | -673.0 | ▂▂▂▃▇ |
| num_of_donation_years | 0 | 1.00 | 2.50 | 1.49 | 1.00 | 1.00 | 2.00 | 4.00 | 5.0 | ▇▅▃▂▃ |
# Maybe it's a good idea to take out all the NAs for age. Obviously we lose a lot of rows, but 251000 left still seems plenty to me.
customer_segmentation_complete <- customer_segmentation_first_prepro %>% drop_na(year_born)
customer_segmentation_complete
ggplot(customer_segmentation_first_prepro, aes(XMAS_donor)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(num_of_donation_years)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro %>% filter(SUMtotal > 0 & SUMtotal < 5000), aes(x = SUMtotal)) +
geom_histogram(binwidth = 100) +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(LastPaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(PenultimatePaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(num_of_donation_years)) +
geom_bar() +
facet_wrap(~generation_moniker)
ggplot(customer_segmentation_first_prepro %>% drop_na(age_at_last_donation), aes(age_at_last_donation)) +
geom_histogram(binwidth = 5)
ggplot(customer_segmentation_first_prepro %>% filter(COUNTtotal < (7 * 6)), aes(COUNTtotal)) +
geom_histogram(binwidth = 1)
ggplot(customer_segmentation_first_prepro %>% drop_na(donation_interval) %>% filter(donation_interval < (360 * 5)), aes(donation_interval)) +
geom_histogram(binwidth = 30)
ggplot(customer_segmentation_first_prepro, aes(days_since_last_payment)) +
geom_histogram(binwidth = 30)
mean_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% mean(na.rm = TRUE)
sd_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% sd(na.rm = TRUE)
ggplot(customer_segmentation_first_prepro %>% drop_na(year_born) %>% filter(SUMtotal < (mean_total_sum + sd_total_sum * 6)), aes(year_born, SUMtotal)) +
geom_point(alpha = 1 / 10)
donors_per_state <- customer_segmentation_first_prepro %>%
select(Bundesland) %>%
group_by(Bundesland) %>%
count() %>%
ungroup()
ggplot(donors_per_state, aes(Bundesland, n)) +
geom_col()
# taken from https://de.statista.com/statistik/daten/studie/75396/umfrage/entwicklung-der-bevoelkerung-in-oesterreich-nach-bundesland-seit-1996/
pop_vienna <- 1921153
pop_lower_austria <- 1691040
pop_upper_austria <- 1495756
pop_styria <- 1247159
pop_tyrol <- 760161
pop_carithia <- 562230
pop_salzburg <- 560643
pop_vorarlberg <- 399164
pop_burgenland <- 296040
donors_per_state_per_100_000_inhabitants <- donors_per_state %>%
mutate(
n = case_when(
Bundesland == "B" ~ n / pop_burgenland * 100000,
Bundesland == "K" ~ n / pop_carithia * 100000,
Bundesland == "N" ~ n / pop_lower_austria * 100000,
Bundesland == "O" ~ n / pop_upper_austria * 100000,
Bundesland == "Sa" ~ n / pop_salzburg * 100000,
Bundesland == "St" ~ n / pop_styria * 100000,
Bundesland == "T" ~ n / pop_tyrol * 100000,
Bundesland == "V" ~ n / pop_vorarlberg * 100000,
Bundesland == "W" ~ n / pop_vienna * 100000
)
)
ggplot(donors_per_state_per_100_000_inhabitants, aes(Bundesland, n)) +
geom_col()
sums_per_state <- customer_segmentation_first_prepro %>%
group_by(Bundesland) %>%
summarize(sum_donations = sum(SUMtotal)) %>%
ungroup()
ggplot(sums_per_state, aes(Bundesland, sum_donations)) +
geom_col()
sums_per_state_per_inhabitant <- sums_per_state %>%
mutate(
sum_donations = case_when(
Bundesland == "B" ~ sum_donations / pop_burgenland,
Bundesland == "K" ~ sum_donations / pop_carithia,
Bundesland == "N" ~ sum_donations / pop_lower_austria,
Bundesland == "O" ~ sum_donations / pop_upper_austria,
Bundesland == "Sa" ~ sum_donations / pop_salzburg,
Bundesland == "St" ~ sum_donations / pop_styria,
Bundesland == "T" ~ sum_donations / pop_tyrol,
Bundesland == "V" ~ sum_donations / pop_vorarlberg,
Bundesland == "W" ~ sum_donations / pop_vienna
)
)
ggplot(sums_per_state_per_inhabitant, aes(Bundesland, sum_donations)) +
geom_col()
# taken from https://de.statista.com/statistik/daten/studie/373051/umfrage/kaufkraft-je-einwohner-in-oesterreich-nach-bundeslaendern/
kaufkraft_vienna <- 22659
kaufkraft_lower_austria <- 25615
kaufkraft_upper_austria <- 24728
kaufkraft_styria <- 23981
kaufkraft_tyrol <- 23579
kaufkraft_carithia <- 23833
kaufkraft_salzburg <- 24685
kaufkraft_vorarlberg <- 25535
kaufkraft_burgenland <- 24919
sums_per_state_per_inhabitant_adjusted <- sums_per_state_per_inhabitant %>%
mutate(
sum_donations = case_when(
Bundesland == "B" ~ sum_donations * (1 / (kaufkraft_burgenland / kaufkraft_lower_austria)),
Bundesland == "K" ~ sum_donations * (1 / (kaufkraft_carithia / kaufkraft_lower_austria)),
Bundesland == "N" ~ sum_donations,
Bundesland == "O" ~ sum_donations * (1 / (kaufkraft_upper_austria / kaufkraft_lower_austria)),
Bundesland == "Sa" ~ sum_donations * (1 / (kaufkraft_salzburg / kaufkraft_lower_austria)),
Bundesland == "St" ~ sum_donations * (1 / (kaufkraft_styria / kaufkraft_lower_austria)),
Bundesland == "T" ~ sum_donations * (1 / (kaufkraft_tyrol / kaufkraft_lower_austria)),
Bundesland == "V" ~ sum_donations * (1 / (kaufkraft_vorarlberg / kaufkraft_lower_austria)),
Bundesland == "W" ~ sum_donations * (1 / (kaufkraft_vienna / kaufkraft_lower_austria))
)
)
ggplot(sums_per_state_per_inhabitant_adjusted, aes(Bundesland, sum_donations)) +
geom_col()
RFM segments customers according to three variabless: Recency, Frequency, Monetary Value. Using the rfm package, RFM scores can be computed either on raw transaction data (one row per transaction), or on aggregated customer data (one row per customer). For the former, the method rfm_table_order can be used, for the latter either rfm_table_customer or rfm_table_customer2. Since our dataset represents aggregated customer data, the latter should be used. It can be computed directly from the raw data upon adding the two variables SUMtotal and COUNTtotal:
rfm_scores <- customer_segmentation_raw %>%
# create new variables: total donation sum; total number of donations
mutate(SUMtotal = SUM2015 + SUM2016 + SUM2017 + SUM2018 + SUM2019,
COUNTtotal = COUNT2015 + COUNT2016 + COUNT2017 + COUNT2018 + COUNT2019,
LastPaymentDate = as.Date(LastPaymentDate)) %>%
# compute RFM scores
rfm_table_customer_2(customer_id = ID,
n_transactions = COUNTtotal,
latest_visit_date = LastPaymentDate,
total_revenue = SUMtotal,
analysis_date = reference_date)
rfm_scores
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 1 has 1 row to replace 0 rows
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 2 has 1 row to replace 0 rows
rfm_scores_on_prepro <- customer_segmentation_first_prepro %>%
rfm_table_customer_2(customer_id = ID,
n_transactions = COUNTtotal,
latest_visit_date = as.Date(LastPaymentDate),
total_revenue = SUMtotal,
analysis_date = reference_date)
rfm_results_on_prepro <- rfm_scores_on_prepro$rfm %>% as.data.frame()
first_prepro_with_rfm_results <- merge(x = customer_segmentation_first_prepro,
y = rfm_results_on_prepro,
by.x = "ID",
by.y = "customer_id")
first_prepro_with_rfm_results
rfm_heatmap(rfm_scores)
In the above heatmap, we can see some interesting patterns (Note: The higher the recency score, the more recent the last donation):
There are further, less obvious customer segments in the heatmap. For the sake of clarity, rather than verbally describing the segments, below we visually represent the customer segments we believe to have identified in the heatmap:
# define data frame with frequency and recency score thresholds for each segment
heatmap_segments_df <- data.frame(x = c(1, 3, 4.5, 0.5, 0.5, 2, 4),
y = c(1.5, 1.5, 1.5, 3.5, 4.5, 4, 4),
lab = c("Lost", "Loyal average donor at risk", "Don't lose",
"Newbie", "Prospects", "Loyal average donor active",
"Champ"))
# plot the customer segments
ggplot(heatmap_segments_df, aes(x, y, label = lab)) +
geom_rect(aes(xmin = 0, xmax = 2, ymin = 0, ymax = 3), fill = "red", alpha = 0.1) +
geom_rect(aes(xmin = 2, xmax = 4, ymin = 0, ymax = 3), fill = "blue", alpha = 0.1) +
geom_rect(aes(xmin = 4, xmax = 5, ymin = 0, ymax = 3), fill = "green", alpha = 0.1) +
geom_rect(aes(xmin = 0, xmax = 1, ymin = 3, ymax = 4), fill = "tomato", alpha = 0.1) +
geom_rect(aes(xmin = 0, xmax = 1, ymin = 4, ymax = 5), fill = "yellow", alpha = 0.1) +
geom_rect(aes(xmin = 1, xmax = 3, ymin = 3, ymax = 5), fill = "orange", alpha = 0.1) +
geom_rect(aes(xmin = 0, xmax = 1, ymin = 4, ymax = 5), fill = "cyan", alpha = 0.1) +
geom_rect(aes(xmin = 3, xmax = 5, ymin = 3, ymax = 5), fill = "magenta", alpha = 0.1) +
geom_text(size=3)
The rfm_segment method can be used to assign donors to a given segment based on their RFM scores. To this end, the upper and lower bounds of recency, frequency and monetary scores for each segment, as well as the respective segment names, need to be defined. However, the code below throws an error, so probably there is a bug in the definition of the lower/upper segment bounds. ToDo: Fix the bug, or remove this.
As an alternative to rfm_segment, segments can be assigned to donors with the help of hand-crafted if-else-rules. However, this segmentation is not useful, because it yields a very high number of donors belonging to the other segment (approx. 25%). The reason for this is probably the aforementioned error in the definition of the upper/lower segment bounds.
rfm_segments <- rfm_scores$rfm %>%
mutate(segment = ifelse(recency_score %in% 4:5 & frequency_score %in% 4:5 & monetary_score %in% 4:5,
"Champ",
ifelse(recency_score %in% 4:5 & frequency_score %in% 2:3 & monetary_score %in% 1:3,
"Regular avg active",
ifelse(recency_score %in% 5:5 & frequency_score %in% 1:1 & monetary_score %in% 4:5,
"Prospect",
ifelse(recency_score %in% 4:4 & frequency_score %in% 1:1 & monetary_score %in% 1:3,
"Newbie",
ifelse(recency_score %in% 1:3 & frequency_score %in% 5:5 & monetary_score %in% 4:5,
"Don't loose",
ifelse(recency_score %in% 1:3 & frequency_score %in% 3:4 & monetary_score %in% 3:4,
"Regular avg at risk",
ifelse(recency_score %in% 1:3 & frequency_score %in% 1:2 & monetary_score %in% 1:2,
"Lost", "Other"))))))))
rfm_segments %>%
ggplot(aes(segment)) +
geom_bar()
rfm_segments$segment %>%
table() %>%
prop.table() %>%
round(3) %>%
sort(decreasing = T)
## .
## Lost Other Champ Regular avg at risk
## 0.267 0.250 0.211 0.133
## Regular avg active Don't loose Newbie Prospect
## 0.068 0.048 0.021 0.002
other_peeps <- rfm_segments %>%
filter(segment == "Other") %>%
select(customer_id) %>%
unique() %>%
(function (x) x$customer_id)
first_prepro_with_rfm_results %>% filter(ID %in% other_peeps)
# these are the same categories as above, just using the first_prepro data instead of the raw data
first_prepro_with_rfm_segments <- first_prepro_with_rfm_results %>%
mutate(segment = ifelse(recency_score %in% 4:5 & frequency_score %in% 4:5 & monetary_score %in% 4:5,
"Champ",
ifelse(recency_score %in% 4:5 & frequency_score %in% 2:3 & monetary_score %in% 1:3,
"Regular avg active",
ifelse(recency_score %in% 5:5 & frequency_score %in% 1:1 & monetary_score %in% 4:5,
"Prospect",
ifelse(recency_score %in% 4:4 & frequency_score %in% 1:1 & monetary_score %in% 1:3,
"Newbie",
ifelse(recency_score %in% 1:3 & frequency_score %in% 5:5 & monetary_score %in% 4:5,
"Don't loose",
ifelse(recency_score %in% 1:3 & frequency_score %in% 3:4 & monetary_score %in% 3:4,
"Regular avg at risk",
ifelse(recency_score %in% 1:3 & frequency_score %in% 1:2 & monetary_score %in% 1:2,
"Lost", "Other"))))))))
first_prepro_with_rfm_segments
As assumed, We’re indeed not covering everything here. E.g. somebody with recency score 4 and frequency score 1 is automatically classified as “other”, regardless of monetary value. But that person could easily be a “Prospect” or “Newbie”. It might therefore be wise to use the bounds recommended by introductions to rfm.
first_prepro_with_rfm_segments %>% filter(segment == "Other") %>%
ggplot(aes(frequency_score, recency_score)) +
geom_tile(aes(fill = monetary_score), colour = "white") +
scale_fill_distiller(palette = "PuBu", direction = +1) +
labs(title="heatmap only on those classified as OTHER in Michael's first try") +
theme_minimal()
The above heatmap shows that we e.g. missed a lot of “big donors” in the first attempt.
To remedy the faulty segmentation shown above, we resort to the customer segments (and the respective RFM score thresholds) presented in class (see slide deck of first class, p. 82 as well as here. We use this mainstream segmentation as our baseline:
# define name of each segment
segment_names_baseline <- c("Champions", "Loyal Customers", "Potential Loyalist",
"New Customers", "Promising", "Need Attention", "About To Sleep",
"At Risk", "Can't Lose Them", "Lost")
# set the upper and lower bounds for recency, frequency, and monetary for each segment
recency_lower <- c(4, 2, 3, 4, 3, 2, 2, 1, 1, 1)
recency_upper <- c(5, 5, 5, 5, 4, 3, 3, 2, 1, 2)
frequency_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
frequency_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
monetary_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
monetary_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
# assign segment to each customer
rfm_segments_baseline <- rfm_segment(rfm_scores,
segment_names_baseline,
recency_lower,
recency_upper,
frequency_lower,
frequency_upper,
monetary_lower,
monetary_upper)
# inspect segment assignment
head(rfm_segments_baseline)
# NOW ON PREPRO DATA and using numeric customer_id
# assign segment to each customer
rfm_segments_baseline_on_prepro <- rfm_segment(rfm_scores_on_prepro,
segment_names_baseline,
recency_lower,
recency_upper,
frequency_lower,
frequency_upper,
monetary_lower,
monetary_upper)
# merge with prepro_data
rfm_results_baseline_on_prepro <- merge(x = customer_segmentation_first_prepro,
y = rfm_segments_baseline_on_prepro,
by.x = "ID",
by.y = "customer_id")
# inspect segment assignment
head(rfm_results_baseline_on_prepro)
The mainstream customer segmentation is better as our own approach since it yields much less other instances (only approximately 6.3% of donors are assigned to this segment):
rfm_results_baseline_on_prepro %>% ggplot(aes(segment)) +
geom_bar()
rfm_results_baseline_on_prepro$segment %>%
table() %>%
prop.table() %>%
round(2) %>%
sort(decreasing = T)
## .
## Loyal Customers Champions Potential Loyalist At Risk
## 0.26 0.20 0.19 0.10
## Lost About To Sleep Others Need Attention
## 0.09 0.08 0.05 0.03
rfm_results_baseline_on_prepro$segment %>%
table() %>%
prop.table() %>%
round(3) %>%
sort(decreasing = T)
## .
## Loyal Customers Champions Potential Loyalist At Risk
## 0.262 0.198 0.187 0.105
## Lost About To Sleep Others Need Attention
## 0.087 0.078 0.054 0.030
Finally, we can inspect median scores for each RFM component per segment:
rfm_plot_median_recency(rfm_results_baseline_on_prepro)
rfm_plot_median_frequency(rfm_results_baseline_on_prepro)
rfm_plot_median_monetary(rfm_results_baseline_on_prepro)